Preprocessing performed with Seurat and Scater. Output to that Rmd file, will visualize multiple settings for PCs to use.
## Parameters for processing dataset and metadata ##
## Cell-cycle scoring on genes:
CC_file = "../scrna/refs/CellCycle/TiroshRegev2016_CellCycleGenes.csv"
MT_genes_file = "../scrna/CMdiff_mon-EB-EHT_d4-26_all_s2s_ercc/genome_addons/MT_genes.txt"
# Chronological order of levels (for instance timepoints)
reorder_label <- "Timepoint"
order_levels <- c("d4" ,"d5","d7", "d8","d9", "d10","d11", "d12", "d13", "d14", "d18", "d21", "m5", "m11", "m12")
## Dimensionality reduction ##
# Settings for UMAP and clustering
elbow_dim_use = 55
PC_set = 25
res_set = 1.5
min_dist = 0.75
grouping_settings = c("coarse_seurat_clusters","Timepoint", "Lineage", "seurat_clusters", "Experiment")
w_umappatch = 12
h_umappatch = 12
DE_assay = "sf"
## Visualization ##
explore_violin = c("NKX2-5", "MYL7")
cc_goi = c("PCNA", "TOP2A", "MCM6", "MKI67")
selected_markers2 = c("GATA4","NKX2-5","ISL1","PDGFRA","RYR2","TNNT2","MYL7","PECAM1","MYL2","IRX4","COL3A1","MKI67")
other_featureprint = c("nFeature_sf","nCount_sf", "GFP", "mCherry","NKX2-5","NR2F2")
GOI_file = ""
no_of_DEgenes = 10
# Show marker heatmap with reordering of the clusters?
reorder_clusterlevels = FALSE
order_clusterlevels = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16")
## Subsetting the dataset ##
# On CellCycle-state?
# Specify a specific (sub)string to select the cells on (selection happens on the colnames)
subset_var = "Phase"
subset_id = "G1"
# Select the type of filtering: keep cells with the substring (set "in") or remove (set "out")
# if NO filtering is wanted, leave empty (set "" or "no")
filtering = "no"
# Settings after filtering:
vars_to_regress = NULL #c("nCount_sf", "nFeature_sf") # If no regression desired: NULL
nHVG = 2000
sub_PC_set = 12
sub_res_set = 1.0
## Storing results ##
dataset_location <- "../snabel/scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/subsets/EB/20211029_ExpInt_ClustUMAP_toPy_EB/seusetv4_integrated.rds"
workdir <- "../snabel/scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/subsets/EB/"
# if regression is performed: this will already be included in the folder name
result_descript = "_ClusteringUMAP_toPy"
## Location of scripts ##
source("../R-scripts/scRNA-seq/feature_plot_pdf_fuction.R")
source("../R-scripts/scRNA-seq/Seurat_subset_rerun.R")
source("../snabel/R-scripts/scRNA-seq/seurat_to_matrices.R")
system(paste("mkdir -p ", workdir))
knitr::opts_knit$set(root.dir = normalizePath(workdir))
set.seed(42)
# ########## Installing important repositories ##########
# # ONLY NEEDED THE FIRST TIME LOADING THIS ENVIRONMENT #
#
# # Needed dependencies for transfer to python with H5AD type-convert:
# Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
# devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")
# install.packages("remotes")
# remotes::install_github("mojaveazure/seurat-disk")
Normalized, scaled dataset with information on the highest variable genes (HVGs).
seuset <- readRDS(paste0(dataset_location))
experiment_table <- read.table("../scrna/CMdiff_mon-EB-EHT_combined/sample-experiment_table.txt", header = TRUE, row.names = 1)
seuset@meta.data$Experiment <- experiment_table[seuset@meta.data$combined_id,]
dateoftoday <- gsub("-", "", as.character(Sys.Date()))
resultsdir <- paste(workdir, dateoftoday, result_descript, sep = "")
system(paste("mkdir -p ", resultsdir))
knitr::opts_knit$set(root.dir = normalizePath(resultsdir))
Results will be stored in: resultsdir
seuset@meta.data[[reorder_label]] <- factor(seuset@meta.data[[reorder_label]], levels = order_levels)
ElbowPlot(seuset, ndims = elbow_dim_use)
Seurat v3 applies a graph-based clustering approach, in which the cells are embedded in a graph structure (for example a K-nearest neighbor graph), with edges between the cells with similar feature expression patterns (determined with euclidean distances in PCA space and overlap within local neighborhoods, as determined with Jaccard similarity). The cells are then tried to partition into quasi-cliques or communities.
Louvain clustering is used after, to iteratively group cells together
# Calculating the distances and finding the neighbors:
seuset <- FindNeighbors(seuset, dims = 1:PC_set)
## Computing nearest neighbor graph
## Computing SNN
seuset <- FindClusters(seuset, resolution = res_set)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5273
## Number of edges: 182598
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8742
## Number of communities: 25
## Elapsed time: 0 seconds
head(Idents(seuset), 10)
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_P8
## 5
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_L15
## 1
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_D1
## 3
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_P14
## 1
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_E9
## 3
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_L11
## 3
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_B12
## 3
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_C12
## 0
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_L14
## 0
## GRCh38_ERCC_reporter_EB_AM_d10_1_942_M2
## 1
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
seuset <- RunUMAP(seuset, dims = 1:PC_set, min.dist = min_dist)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:08:46 UMAP embedding parameters a = 0.2734 b = 1.622
## 11:08:46 Read 5273 rows and found 25 numeric columns
## 11:08:46 Using Annoy for neighbor search, n_neighbors = 30
## 11:08:46 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:08:47 Writing NN index file to temp file /scratch/snabel/tmp/oFX4/RtmprrUuvE/file2973183e54d5c9
## 11:08:47 Searching Annoy index using 1 thread, search_k = 3000
## 11:08:48 Annoy recall = 100%
## 11:08:48 Commencing smooth kNN distance calibration using 1 thread
## 11:08:49 Initializing from normalized Laplacian + noise
## 11:08:49 Commencing optimization for 500 epochs, with 215136 positive edges
## 11:08:54 Optimization finished
pdf(paste0("UMAPs_PC1-", PC_set, "_", res_set,"_clusters.pdf"), width = 10, height = 7)
DimPlot(seuset, reduction = "umap", group.by = "seurat_clusters", label = TRUE, combine = TRUE)
dev.off()
## png
## 2
pdf(paste0("UMAPs_PC1-", PC_set, "_original_coarse_clusters.pdf"), width = 10, height = 7)
DimPlot(seuset, reduction = "umap", group.by = "coarse_seurat_clusters", label = TRUE, combine = TRUE)
dev.off()
## png
## 2
“First, we assign each cell a score, based on its expression of G2/M and S phase markers. These marker sets should be anticorrelated in their expression levels, and cells expressing neither are likely not cycling and in G1 phase.” - Seurat
Using the list in Seurat´s tutorials from Tirosh for their CC correction.
# Generate cell cycle gene lists
ccgenes <- read.csv(CC_file, header = TRUE, sep = ";")
g2m.genes <- as.character(ccgenes$G2.M)
s.genes <- as.character(ccgenes$G1.S)
s.genes <- s.genes[!s.genes == ""]
s.genes <- s.genes[s.genes %in% rownames(seuset)]
g2m.genes <- g2m.genes[g2m.genes %in% rownames(seuset)]
# Running scoring and dimensionality reduction with the cell cycle gene list
seuset <- CellCycleScoring(seuset, s.features = s.genes, g2m.features = g2m.genes, set.ident = FALSE)
seuset.cc <- RunPCA(seuset, features = unique(c(s.genes, g2m.genes)), npcs = 10)
## PC_ 1
## Positive: UNG, CDCA7, ATAD2, CDC6, MCM6, UHRF1, HN1, GAS2L3, GINS2, MCM2
## BLM, DTL, MCM4, FEN1, RAD51, AURKA, NASP, TMPO, PCNA, HELLS
## NCAPD2, HMMR, CLSPN, BIRC5, CKAP2, BRIP1, CKAP2L, CKS2, CCNB2, CDCA3
## Negative: TOP2A, CENPF, NUSAP1, UBE2C, CDK1, MKI67, DLGAP5, TPX2, KIF2C, AURKB
## TTK, NUF2, HJURP, CENPE, ANLN, KIF11, KIF23, GTSE1, NDC80, TACC3
## HMGB2, CDCA8, CDCA2, RAD51AP1, CENPA, SMC4, RRM2, BUB1, CKS1B, KIF20B
## PC_ 2
## Positive: CENPA, AURKA, HMMR, MKI67, CDCA3, GAS2L3, CCNB2, BUB1, CENPE, TPX2
## DLGAP5, CKAP2L, UBE2C, CDCA8, GTSE1, FAM64A, AURKB, TACC3, NDC80, HJURP
## NCAPD2, KIF23, KIF2C, NUF2, TOP2A, CENPF, KIF11, TTK, BIRC5, CDCA2
## Negative: DTL, PCNA, MCM4, UHRF1, MCM6, CDC6, CLSPN, FEN1, UNG, MCM2
## CDCA7, GINS2, RAD51, HELLS, BLM, NASP, BRIP1, ATAD2, RAD51AP1, RRM2
## TMPO, SMC4, CDK1, KIF20B, CKS1B, HMGB2, HN1, CKS2, ANLN, CKAP2
## PC_ 3
## Positive: NDC80, AURKA, HJURP, ATAD2, KIF23, RRM2, CDK1, CKAP2L, CENPA, RAD51
## CDCA8, CDCA3, CDCA2, TTK, BRIP1, KIF2C, AURKB, CDC6, RAD51AP1, BLM
## CLSPN, KIF11, PCNA, NUSAP1, FEN1, DTL, UBE2C, ANLN, NUF2, GINS2
## Negative: CCNB2, NASP, NCAPD2, BIRC5, HMMR, CENPF, HMGB2, DLGAP5, CDCA7, HN1
## MCM6, TMPO, MCM2, CENPE, HELLS, CKS1B, KIF20B, TPX2, SMC4, GAS2L3
## MKI67, CKS2, CKAP2, FAM64A, TACC3, BUB1, UNG, MCM4, TOP2A, GTSE1
## PC_ 4
## Positive: HMGB2, ATAD2, GAS2L3, MKI67, NCAPD2, RRM2, HELLS, BLM, DLGAP5, RAD51AP1
## SMC4, TPX2, GTSE1, KIF20B, TOP2A, NUF2, BRIP1, KIF11, CENPF, TACC3
## RAD51, NUSAP1, CENPE, CLSPN, TMPO, ANLN, AURKB, CDCA2, BUB1, CKAP2L
## Negative: HN1, AURKA, CDCA3, CDCA7, CKAP2, UNG, CENPA, MCM6, FAM64A, UHRF1
## BIRC5, CDCA8, DTL, MCM2, HJURP, NASP, CKS2, PCNA, HMMR, MCM4
## CKS1B, NDC80, TTK, CCNB2, FEN1, KIF2C, KIF23, CDC6, GINS2, CDK1
## PC_ 5
## Positive: GAS2L3, ATAD2, CDC6, KIF20B, HMMR, CDCA7, DTL, AURKA, KIF11, NUF2
## CKAP2, MCM2, BRIP1, CLSPN, HN1, BUB1, KIF2C, UHRF1, CENPE, UNG
## CENPF, MKI67, SMC4, CDCA2, TTK, ANLN, MCM4, MCM6, NCAPD2, TPX2
## Negative: BLM, RAD51, TMPO, RRM2, CKS2, HJURP, FEN1, FAM64A, NASP, AURKB
## HMGB2, PCNA, NUSAP1, GTSE1, RAD51AP1, CCNB2, CDCA8, GINS2, NDC80, CDCA3
## HELLS, TOP2A, CDK1, CKAP2L, DLGAP5, CKS1B, CENPA, UBE2C, KIF23, BIRC5
# Visualizing the effect of cell cycle in the dataset
DimPlot(seuset.cc, reduction = "pca", group.by = "Phase")
DimPlot(seuset, group.by = "Phase")
RidgePlot(seuset, features = cc_goi, ncol = 2)
## Picking joint bandwidth of 0.169
## Picking joint bandwidth of 0.0746
## Picking joint bandwidth of 0.0934
## Picking joint bandwidth of 0.047
pl.list <- list()
for (i in c(grouping_settings, "Phase")){
print(i)
pl.list[[i]] <- DimPlot(seuset, reduction = "umap", group.by = i, combine = TRUE)
print(pl.list[[i]])
}
## [1] "coarse_seurat_clusters"
## [1] "Timepoint"
## [1] "Lineage"
## [1] "seurat_clusters"
## [1] "Experiment"
## [1] "Phase"
pdf(paste0("UMAPs_PC1-", PC_set, "_", res_set,".pdf"), width = w_umappatch, height = h_umappatch)
Reduce( `+`, pl.list ) +
patchwork::plot_layout( ncol = 2 )
dev.off()
## png
## 2
DefaultAssay(seuset) <- DE_assay
pdf(paste0("Featureplot_PC1-", PC_set, "_SelectedMarkers2.pdf"), width = 20, height = 15)
FeaturePlot(seuset, selected_markers2)
dev.off()
## png
## 2
pdf(paste0("Featureplot_PC1-", PC_set, paste(other_featureprint, collapse = "_"), ".pdf"), width = 20, height = 15)
FeaturePlot(seuset, other_featureprint)
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: GFP
dev.off()
## png
## 2
feature_plot_pdfs(seuset)
## png
## 2
seuset.markers <- FindAllMarkers(seuset, only.pos =TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
write.csv(seuset.markers, paste0("all_seuset_markers_PC1-", PC_set,"_res", res_set,".csv"), quote = FALSE)
seuset <- ScaleData(object = seuset,
vars.to.regress = vars_to_regress,
assay = DE_assay,
features = rownames(seuset))
## Centering and scaling data matrix
top_markers <- seuset.markers %>% group_by(cluster) %>% top_n(n = no_of_DEgenes, wt = avg_log2FC)
DoHeatmap(seuset, features = top_markers$gene, group.by = "seurat_clusters") + NoLegend()
pdf(paste0("ClusterHeatmapTop", no_of_DEgenes, "_PC1-", PC_set,"_res", res_set, "_UMAP.pdf"), width = 20, height = 20)
DoHeatmap(seuset, features = top_markers$gene, group.by = "seurat_clusters") + NoLegend() +
theme(text = element_text(size = 8))
dev.off()
## png
## 2
if (reorder_clusterlevels == TRUE){
# Set the order of clusters for visualization:
old_levels <- levels(seuset$seurat_clusters)
levels(seuset$seurat_clusters) <- order_clusterlevels
pdf(paste0("ClusterHeatmapTop", no_of_DEgenes, "_PC1-", PC_set,"_res", res_set, "_UMAP_reordered.pdf"), width = 20, height = 20)
DoHeatmap(seuset, features = top_markers$gene, group.by = "seurat_clusters") + NoLegend() + theme(text = element_text(size = 8))
dev.off()
# # reset level order:
#levels(seuset$seurat_clusters) <- old_levels
}
saveRDS(seuset, paste0("seusetv4_PC1-", PC_set, "_res", res_set, ".rds" ))
seurat_to_matrices(seurat_object = seuset, output_folder = resultsdir, integration_object = TRUE)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## [1] "Generating files at: ../snabel/scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/subsets/EB/20211029_ClusteringUMAP_toPy/ for assay: sf, a counts.mtx and scaled_counts.csv"
## [1] "Generating files at: ../snabel/scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/subsets/EB/20211029_ClusteringUMAP_toPy/ for assay: uf, a counts.mtx and scaled_counts.csv"
## [1] "Generating files at: ../snabel/scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/subsets/EB/20211029_ClusteringUMAP_toPy/ for assay: integrated, a counts.mtx and scaled_counts.csv"
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /vol/mbconda/snabel/anaconda3/envs/kb_scrna_R_mon3/lib/libopenblasp-r0.3.17.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Matrix_1.3-4 scales_1.1.1 RColorBrewer_1.1-2 SeuratObject_4.0.2
## [5] Seurat_4.0.4 knitr_1.33 tidyr_1.1.3 dplyr_1.0.7
## [9] ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_2.0-2 deldir_0.2-10
## [4] ellipsis_0.3.2 ggridges_0.5.3 spatstat.data_2.1-0
## [7] farver_2.1.0 leiden_0.3.9 listenv_0.8.0
## [10] ggrepel_0.9.1 RSpectra_0.16-0 fansi_0.5.0
## [13] codetools_0.2-18 splines_4.1.0 polyclip_1.10-0
## [16] jsonlite_1.7.2 ica_1.0-2 cluster_2.1.2
## [19] png_0.1-7 uwot_0.1.10 shiny_1.6.0
## [22] sctransform_0.3.2 spatstat.sparse_2.0-0 compiler_4.1.0
## [25] httr_1.4.2 assertthat_0.2.1 fastmap_1.1.0
## [28] lazyeval_0.2.2 limma_3.48.3 later_1.3.0
## [31] htmltools_0.5.2 tools_4.1.0 igraph_1.2.6
## [34] gtable_0.3.0 glue_1.4.2 RANN_2.6.1
## [37] reshape2_1.4.4 Rcpp_1.0.7 scattermore_0.7
## [40] jquerylib_0.1.4 vctrs_0.3.8 nlme_3.1-152
## [43] lmtest_0.9-38 xfun_0.25 stringr_1.4.0
## [46] globals_0.14.0 mime_0.11 miniUI_0.1.1.1
## [49] lifecycle_1.0.0 irlba_2.3.3 goftest_1.2-2
## [52] future_1.22.1 MASS_7.3-54 zoo_1.8-9
## [55] spatstat.core_2.3-0 promises_1.2.0.1 spatstat.utils_2.2-0
## [58] parallel_4.1.0 yaml_2.2.1 reticulate_1.20
## [61] pbapply_1.4-3 gridExtra_2.3 sass_0.4.0
## [64] rpart_4.1-15 stringi_1.7.4 highr_0.9
## [67] rlang_0.4.11 pkgconfig_2.0.3 matrixStats_0.60.1
## [70] evaluate_0.14 lattice_0.20-44 ROCR_1.0-11
## [73] purrr_0.3.4 tensor_1.5 labeling_0.4.2
## [76] patchwork_1.1.1 htmlwidgets_1.5.3 cowplot_1.1.1
## [79] tidyselect_1.1.1 parallelly_1.27.0 RcppAnnoy_0.0.19
## [82] plyr_1.8.6 magrittr_2.0.1 R6_2.5.1
## [85] generics_0.1.0 DBI_1.1.1 pillar_1.6.2
## [88] withr_2.4.2 mgcv_1.8-36 fitdistrplus_1.1-5
## [91] survival_3.2-13 abind_1.4-5 tibble_3.1.4
## [94] future.apply_1.8.1 crayon_1.4.1 KernSmooth_2.23-20
## [97] utf8_1.2.2 spatstat.geom_2.2-2 plotly_4.9.4.1
## [100] rmarkdown_2.10 grid_4.1.0 data.table_1.14.0
## [103] digest_0.6.27 xtable_1.8-4 httpuv_1.6.2
## [106] munsell_0.5.0 viridisLite_0.4.0 bslib_0.3.0